1. Load Required Packages

library(ggplot2)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rlang)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(RColorBrewer)

2. Data Preparation

# Load data
# NOTE(review): absolute, machine-specific path. The code below assumes the
# file provides a data frame named `dat1` -- confirm when running elsewhere.
load("/Users/lei/CU-Biostatistics/Data Science 2/midterm_project/mtp/dat1.RData")

# Define response variable
response_var <- "log_antibody"

# Define continuous variables
continuous_vars <- c(
  "age",      # Age of the participant
  "height",   # Height in cm
  "weight",   # Weight in kg
  "bmi",      # Body Mass Index
  "SBP",      # Systolic Blood Pressure
  "LDL",      # Low-Density Lipoprotein
  "time"      # Time measurement
)

# Fail early, with a clear message, if an expected column is missing
stopifnot(all(c(response_var, continuous_vars) %in% names(dat1)))

# Define categorical variables: every remaining column (everything except
# "id", the response and the continuous predictors) stored as numeric or factor
candidate_vars <- setdiff(names(dat1), c("id", response_var, continuous_vars))
# vapply() always returns a logical vector (even for zero candidates), whereas
# sapply() would return an empty list that cannot be used as a subscript
is_categorical <- vapply(
  dat1[candidate_vars],
  function(x) is.numeric(x) || is.factor(x),
  logical(1)
)
categorical_vars <- candidate_vars[is_categorical]

# Convert categorical variables to factors
dat1[categorical_vars] <- lapply(dat1[categorical_vars], factor)

# Echo how each column was classified so the grouping can be checked by eye
cat("Response variable:", response_var, "\n")
## Response variable: log_antibody
cat("Continuous variables:", toString(continuous_vars), "\n")
## Continuous variables: age, height, weight, bmi, SBP, LDL, time
cat("Categorical variables:", toString(categorical_vars), "\n")
## Categorical variables: gender, race, smoking, diabetes, hypertension

3. Exploratory Data Analysis

3.1 Relationship between Continuous Variables and Response Variable

# Prepare data for plotting: keep the response and the continuous predictors,
# then stack the predictors into (Variable, Value) pairs so that each
# predictor can be drawn in its own facet
long_df <- dat1 %>%
  select(all_of(c(response_var, continuous_vars))) %>%
  pivot_longer(
    cols = all_of(continuous_vars),
    names_to = "Variable",
    values_to = "Value"
  )

# Plot relationships between all continuous variables and response variable.
# Dashed red line: linear fit; solid blue line: LOESS smooth.
ggplot(long_df, aes(x = Value, y = .data[[response_var]])) +
  geom_point(alpha = 0.3, color = "grey30") +
  geom_smooth(method = "lm", se = FALSE, color = "red", linetype = "dashed") +
  geom_smooth(method = "loess", se = FALSE, color = "blue") +
  # Predictors are on different scales, so each facet gets its own x-axis
  facet_wrap(~ Variable, scales = "free_x", ncol = 3) +
  theme_minimal(base_size = 14) +
  labs(
    title = paste("Linear vs Nonlinear Relationships with", response_var),
    x = "Predictor",
    # The response column is plotted as-is; prefixing "Log of" to a name that
    # already says "log_" would wrongly suggest a second log transform
    y = response_var
  )
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

3.2 Individual Variable Relationship Plots

# Create individual relationship plots for each continuous variable:
# a scatter plot of the response against the predictor with a linear fit
# (solid red) and a LOESS smooth (dashed blue) overlaid
for (var in continuous_vars) {
  # `.data[[name]]` looks a column up by its string name; it replaces the
  # deprecated `aes_string()`
  p <- ggplot(dat1, aes(x = .data[[var]], y = .data[[response_var]])) +
    geom_point(alpha = 0.2, color = "grey30") +
    # `linewidth` replaces `size` for line geoms (ggplot2 >= 3.4.0)
    geom_smooth(method = "lm", se = FALSE, color = "red", linewidth = 1) +
    geom_smooth(method = "loess", se = FALSE, color = "blue", linetype = "dashed") +
    theme_bw(base_size = 16) +
    labs(
      title = paste(response_var, "vs", var),
      x = var,
      # The response column is plotted as-is; prefixing "Log of" to a name
      # that already says "log_" would wrongly suggest a second log transform
      y = response_var
    )

  # ggplot objects are not auto-printed inside a `for` loop
  print(p)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

3.3 Correlation Analysis

# Prepare numeric variables data: the response plus all continuous predictors
num_vars <- c(response_var, continuous_vars)
df_num <- dat1[, num_vars]

# Calculate correlation matrix on rows with no missing values, rounded to two
# decimals so the values can be printed inside the tiles
cor_matrix <- round(cor(df_num, use = "complete.obs"), 2)

# Convert to long format (columns Var1, Var2, value) and create heatmap
cor_melted <- melt(cor_matrix)
ggplot(cor_melted, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile(color = "white") +
  # Diverging palette centred on 0. `limits` is spelled out in full: the
  # previous `limit` only worked through partial argument matching.
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limits = c(-1, 1), space = "Lab",
    name = "Correlation"
  ) +
  geom_text(aes(label = value), color = "black", size = 4) +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  labs(
    title = paste("Correlation Matrix of", response_var, "and Continuous Variables"),
    x = "", y = ""
  )